"""Word-count example using PySpark.

Reads a text file, splits each line into words on spaces, counts the
occurrences of each word, and prints the (word, count) pairs sorted by
count in descending order.
"""
import os
import sys

from pyspark import SparkConf, SparkContext

# Point PySpark worker processes at a local Python interpreter (Windows
# setup). setdefault avoids clobbering a value already configured in the
# environment.
os.environ.setdefault(
    'PYSPARK_PYTHON',
    "C:/Users/28474/AppData/Local/Programs/Python/Python310/python.exe",
)

# Default input file; an alternative path may be supplied as argv[1].
DEFAULT_INPUT = (
    "E:/storage/study/Python/python-learn/day01/02-python-learn"
    "/13_pyspark/素材/hello.txt"
)

# 1. Build the Spark configuration and context (local mode, all cores).
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)
try:
    # 2. Read the input text file (one RDD record per line).
    input_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_INPUT
    rdd = sc.textFile(input_path)
    # 3. Split every line into individual words.
    words = rdd.flatMap(lambda line: line.split(" "))
    # 4. Map each word to a (word, 1) pair.
    word_with_count = words.map(lambda word: (word, 1))
    # 5. Sum the counts per word.
    counts = word_with_count.reduceByKey(lambda a, b: a + b)
    # 6. Sort by count, descending, collapsing to one partition so the
    #    global order is preserved in the collected result.
    final_rdd = counts.sortBy(lambda kv: kv[1], ascending=False, numPartitions=1)
    print(final_rdd.collect())
finally:
    # Always release Spark resources, even if the job fails.
    sc.stop()